import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
#conda install -c districtdatalabs yellowbrick --yes
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as regplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
# Load the classic Boston housing dataset and assemble it into a single
# DataFrame with the regression target in a 'TARGET' column.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; this cell requires an older scikit-learn version.
boston = load_boston()
print(boston['DESCR'])
bostonDF = pd.DataFrame(boston['data'], columns=boston['feature_names'])
bostonDF['TARGET'] = boston['target']
# Notebook display of the first rows; a no-op when run as a script.
bostonDF.head()
def ageBin(age):
    """Bucket the Boston 'AGE' feature into an ordinal category label.

    Parameters
    ----------
    age : float
        Value of the AGE column.

    Returns
    -------
    str
        'NEW' (< 30), 'USED' (< 60), 'OLD' (< 80), 'CREAKY' (< 95),
        or 'FOSSIL' (>= 95).
    """
    # Upper bounds are checked in ascending order; the first match wins.
    # (Body indentation restored -- it was lost in the notebook export.)
    if age < 30:
        return 'NEW'
    if age < 60:
        return 'USED'
    if age < 80:
        return 'OLD'
    if age < 95:
        return 'CREAKY'
    return 'FOSSIL'
# Derive the categorical AGE_BIN feature from AGE and inspect its distribution.
bostonDF['AGE_BIN'] = bostonDF['AGE'].apply(ageBin)
bostonDF['AGE_BIN'].hist()
# Full exploratory-data-analysis report (notebook display).
# NOTE(review): pandas_profiling has been renamed to ydata-profiling;
# this call only works with the legacy package.
pandas_profiling.ProfileReport(bostonDF)
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline transformer that selects a fixed list of DataFrame columns.

    Parameters
    ----------
    columns : list of str
        Column names to keep, in order.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return X restricted to ``self.columns``.

        Raises
        ------
        KeyError
            If any requested column is missing from X.
        """
        assert isinstance(X, pd.DataFrame)
        try:
            return X[self.columns]
        except KeyError as err:
            cols_error = list(set(self.columns) - set(X.columns))
            # Chain the original exception so the pandas traceback is kept.
            raise KeyError(
                "The DataFrame does not include the columns: %s" % cols_error
            ) from err
class TypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline transformer that keeps only columns of a given dtype."""

    def __init__(self, dtype):
        # dtype may be an abstract numpy type (e.g. np.number) or a
        # pandas dtype string such as "category".
        self.dtype = dtype

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return the subset of X's columns matching ``self.dtype``."""
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])
# Mark AGE_BIN as categorical so TypeSelector("category") picks it up,
# then drop the raw AGE column it replaces.
bostonDF['AGE_BIN'] = bostonDF['AGE_BIN'].astype('category')
bostonDF = bostonDF.drop('AGE', axis=1)
# Preprocessing pipeline: standard-scale the numeric columns and one-hot
# encode the categorical ones (dropping the first level to avoid the
# dummy-variable trap), then concatenate the two feature sets.
process = make_pipeline(
FeatureUnion(transformer_list=[
("numeric_features", make_pipeline(
TypeSelector(np.number),
StandardScaler()
)),
("categorical_features", make_pipeline(
TypeSelector("category"),
OneHotEncoder(categories='auto', drop='first')
))
])
)
# Accumulators: per-model error metrics and fitted coefficient vectors.
resultsDF = pd.DataFrame(columns=['MAE', 'MSE', 'RMSE'])
compareDF = pd.DataFrame()
# --- Baseline: ordinary least squares ---
df = bostonDF.copy()
y = df['TARGET']
X = df.drop('TARGET', axis=1)
# Fixed seed so every model below sees the identical 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
clf = LinearRegression()
pipe = Pipeline(steps=[('process', process), ('regression', clf)])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Held-out error metrics.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = math.sqrt(mse)
print('MAE = {:.2f}'.format(mae))
print('MSE = {:.2f}'.format(mse))
print('RMSE = {:.2f}'.format(rmse))
resultsDF.loc['OLS'] = [mae, mse, rmse]
# Keep the fitted coefficients for the model-comparison chart later on.
compareDF = pd.concat([compareDF, pd.DataFrame(clf.coef_)], axis=1)
# Mean absolute percentage error and the derived "accuracy" figure.
errors = abs(y_test - pred)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('MAPE : {:.2f}%'.format(np.mean(mape)))
print('Accuracy : {:.2f}%'.format(accuracy))
# Visual diagnostics for the OLS pipeline via yellowbrick.
# poof() was deprecated in yellowbrick 1.0 and later removed; show() is
# the current name for the same draw-and-display call.
plt.figure(figsize=(15,10))
visualizer = PredictionError(pipe)
visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
visualizer.score(X_test, y_test)   # Evaluate the model on the test data
g = visualizer.show()              # Draw/show the data
plt.figure(figsize=(15,10))
visualizer = ResidualsPlot(pipe)
visualizer.fit(X_train, y_train)   # Fit the training data to the model
visualizer.score(X_test, y_test)   # Evaluate the model on the test data
visualizer.show()
# Refit OLS with statsmodels on the training fold to get the full
# inferential summary: coefficients, t-stats, R^2, diagnostics.
tDF = pd.concat([X_train, y_train], axis=1)
f = 'TARGET ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + TAX + PTRATIO + B + LSTAT + AGE_BIN'
lm = smf.ols(formula=f, data=tDF).fit()
print(lm.summary())
Creates a partial regression subplot for each explanatory variable. Each plot shows the relationship between the response and the given explanatory variable after removing the effect of all other explanatory variables. These plots help isolate the effect of a single variable on the response.
# Partial-regression (added-variable) plot for every explanatory variable.
fig = plt.figure(figsize=(12,30))
fig = sm.graphics.plot_partregress_grid(lm, fig=fig)
Variance Inflation Factor (VIF) measures the collinearity of predictor variables. The VIF of a coefficient is the ratio of its variance in the full model to the variance it would have if that predictor were fit in isolation.
# Compute a VIF per design-matrix column (including the intercept).
variables = lm.model.exog
vif = [variance_inflation_factor(variables, i) for i in range(variables.shape[1])]
vifDF = pd.DataFrame()
vifDF['Coefficient'] = lm.params
vifDF['VIF'] = vif
# VIF > 5 is the common rule-of-thumb threshold for problematic collinearity.
vifDF['Flag'] = np.where(vifDF.VIF > 5, True, False)
# The intercept's VIF is not meaningful; drop it from the report.
vifDF = vifDF.drop('Intercept', axis=0)
vifDF
# Cross-check two suspect predictors with their pairwise correlation.
print(np.corrcoef(df.TAX, df.RAD))
# In-sample residual plot for the statsmodels fit.
preds = lm.predict(X_train)
# NOTE(review): residuals are negated here (and in later plots), so the
# y-axis shows fitted - actual rather than the conventional actual - fitted.
plt.scatter(preds, -lm.resid)
plt.title('Residual Plot (Residuals vs Pred)')
plt.ylabel('Residual')
# Jarque-Bera test for normality of the residuals.
cols = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(lm.resid)
pd.DataFrame(zip(cols, test)).T
Leverage: how far an observation's predictor values lie from those of the other observations.
Influence: how much an observation changes the fitted coefficients when included.
Watch for: points that combine high leverage with large residuals.
# Influence plot: studentized residuals vs leverage, with point size
# proportional to Cook's distance.
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.influence_plot(lm, ax=ax, criterion="cooks")
# Two high-leverage observations selected for closer inspection.
points = [380, 418]
X_train[X_train.index.isin(points)]
pred = lm.predict(X_train)
plt.title('High Leverage Points (Predicted vs Actual)')
plt.scatter(y_train, pred)
plt.ylabel('Actual')
# Loop-body indentation restored -- it was lost in the notebook export.
for pt in points:
    # Highlight the flagged observations in red.
    plt.scatter(pred.loc[pt], y_train.loc[pt], color='red')
preds = lm.predict(X_train)
plt.scatter(preds, -lm.resid)  # negated residuals (fitted - actual)
plt.ylabel('Residual')
plt.title('Predicted vs Residual')
for pt in points:
    plt.scatter(pred.loc[pt], -lm.resid[pt], color='red')
# Observations with the largest residuals, selected for closer inspection.
points = [364, 372, 368]
X_train[X_train.index.isin(points)]
pred = lm.predict(X_train)
plt.title('Large Residual Points (Predicted vs Actual)')
plt.scatter(y_train, pred)
plt.ylabel('Actual')
# Loop-body indentation restored -- it was lost in the notebook export.
for pt in points:
    # Highlight the flagged observations in red.
    plt.scatter(pred.loc[pt], y_train.loc[pt], color='red')
preds = lm.predict(X_train)
plt.scatter(preds, -lm.resid)  # negated residuals (fitted - actual)
plt.ylabel('Residual')
plt.title('Predicted vs Residual')
for pt in points:
    plt.scatter(pred.loc[pt], -lm.resid[pt], color='red')
# Final residual scatter rendered on its own figure.
preds = lm.predict(X_train)
plt.scatter(preds, -lm.resid)
plt.ylabel('Residual')
plt.title('Predicted vs Residual')
plt.show()
Produces a p-value; when it is less than the 0.05 significance level, we can reject the null hypothesis that the variance of the residuals is constant and infer that heteroscedasticity is present.
# Breusch-Pagan test for heteroscedasticity of the residuals.
cols = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(lm.resid, lm.model.exog)
pd.DataFrame(zip(cols, test)).T
Adding features to Linear Regression will always have the effect of improving R-Squared. To mitigate the chance of overfitting (fitting noise in the data) we want to penalize larger coefficients using the L1 or L2 norm. The choice of L1 or L2 has an important effect.
# --- Lasso (L1-penalized) regression on the same fixed split ---
df = bostonDF.copy()
y = df['TARGET']
X = df.drop('TARGET', axis=1)
# Same seed as the OLS cell, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
clf = Lasso()
pipe = Pipeline(steps=[('process', process), ('regression', clf)])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Held-out error metrics.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = math.sqrt(mse)
print('MAE = {:.2f}'.format(mae))
print('MSE = {:.2f}'.format(mse))
print('RMSE = {:.2f}'.format(rmse))
resultsDF.loc['Lasso'] = [mae, mse, rmse]
# Append the Lasso coefficients for the comparison chart.
compareDF = pd.concat([compareDF, pd.DataFrame(clf.coef_)], axis=1)
# Sweep the Lasso regularization strength and record the coefficients to
# visualize how L1 shrinks weights all the way to exactly zero.
hyperDF = pd.DataFrame()
alpha = [0.0, 0.05, 0.25, 0.5, 0.75, 1.0]
# Loop-body indentation restored -- it was lost in the notebook export.
# (Also dropped an unused `cols = []` accumulator.)
for a in alpha:
    col = 'Alpha-{}'.format(a)
    # NOTE(review): alpha=0 makes Lasso equivalent to unpenalized OLS;
    # scikit-learn advises LinearRegression for that case.
    clf = Lasso(alpha=a)
    pipe = Pipeline(steps=[('process', process), ('regression', clf)])
    pipe.fit(X_train, y_train)
    hyperDF[col] = clf.coef_
# Sort by the unpenalized coefficients so the bars line up across panels.
hyperDF = hyperDF.sort_values('Alpha-0.0')
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12,8), sharey=True)
ax1 = hyperDF['Alpha-0.0'].plot.bar(ax=ax1)
ax1.set_title('Lasso - Alpha 0.0')
ax2 = hyperDF['Alpha-0.25'].plot.bar(ax=ax2)
ax2.set_title('Lasso - Alpha 0.25')
ax3 = hyperDF['Alpha-1.0'].plot.bar(ax=ax3)
ax3.set_title('Lasso - Alpha 1.0')
# --- Ridge (L2-penalized) regression on the same fixed split ---
df = bostonDF.copy()
y = df['TARGET']
X = df.drop('TARGET', axis=1)
# Same seed as the OLS cell, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
clf = Ridge()
pipe = Pipeline(steps=[('process', process), ('regression', clf)])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
# Held-out error metrics.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = math.sqrt(mse)
print('MAE = {:.2f}'.format(mae))
print('MSE = {:.2f}'.format(mse))
print('RMSE = {:.2f}'.format(rmse))
resultsDF.loc['Ridge'] = [mae, mse, rmse]
# Append the Ridge coefficients for the comparison chart.
compareDF = pd.concat([compareDF, pd.DataFrame(clf.coef_)], axis=1)
# Sweep the Ridge regularization strength; L2 shrinks coefficients toward
# zero but, unlike Lasso, does not force them exactly to zero.
hyperDF = pd.DataFrame()
alpha = [0.0, 0.05, 0.25, 0.5, 0.75, 1.0, 500.0]
# Loop-body indentation restored -- it was lost in the notebook export.
# (Also dropped an unused `cols = []` accumulator.)
for a in alpha:
    col = 'Ridge-{}'.format(a)
    clf = Ridge(alpha=a)
    pipe = Pipeline(steps=[('process', process), ('regression', clf)])
    pipe.fit(X_train, y_train)
    hyperDF[col] = clf.coef_
# Sort by the unpenalized coefficients so the bars line up across panels.
hyperDF = hyperDF.sort_values('Ridge-0.0')
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12,8), sharey=True)
ax1 = hyperDF['Ridge-0.0'].plot.bar(ax=ax1)
ax1.set_title('Ridge - Alpha 0.0')
ax2 = hyperDF['Ridge-1.0'].plot.bar(ax=ax2)
ax2.set_title('Ridge - Alpha 1.0')
ax3 = hyperDF['Ridge-500.0'].plot.bar(ax=ax3)
ax3.set_title('Ridge - Alpha 500.0')
# Side-by-side bar chart of the OLS / Lasso / Ridge coefficient vectors.
fig, ax = plt.subplots(figsize=(10,8))
# compareDF gained one unnamed column per model, in fit order.
compareDF.columns = ['OLS', 'Lasso', 'Ridge']
compareDF = compareDF.sort_values('OLS')
plt.title('Comparison of Model Components')
plt.ylabel('Coefficient Value')
compareDF[['OLS', 'Ridge', 'Lasso']].plot.bar(ax=ax)
# --- ElasticNet: blend of the L1 and L2 penalties ---
df = bostonDF.copy()
y = df['TARGET']
X = df.drop('TARGET', axis=1)
# Same seed as the other cells, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
# Sweep the L1/L2 mixing ratio from mostly-L2 (0.1) to mostly-L1 (0.9).
# Loop-body indentation restored -- it was lost in the notebook export.
for l1 in np.arange(0.1, 1, 0.1):
    clf = ElasticNet(l1_ratio=l1)
    pipe = Pipeline(steps=[('process', process), ('regression', clf)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = math.sqrt(mse)
    classifier = 'ElasticNet - L1={:.1f} L2={:.1f}'.format(l1, 1.0 - l1)
    print('{} MAE {:.3f} MSE {:.3f} RMSE {:.3f}'.format(classifier, mae, mse, rmse))
    resultsDF.loc[classifier] = [mae, mse, rmse]
# Rank all recorded models by error. These are notebook display expressions;
# only the last one's output would be shown in a notebook cell, and neither
# prints when run as a script.
resultsDF.sort_values('RMSE')
resultsDF.sort_values('MAE')